Code
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Install a global "ignore" filter so no warnings are shown in later output.
warnings.filterwarnings('ignore')
kakamana
January 6, 2023
Let's explore the distribution functions PDF and CDF using the Iris data set.
sepal_length | sepal_width | petal_length | petal_width | type | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type'], dtype='object')
Iris-setosa 50
Iris-versicolor 50
Iris-virginica 50
Name: type, dtype: int64
1.5 14
1.4 12
1.3 7
1.6 7
1.7 4
1.2 2
1.9 2
1.1 1
1.0 1
Name: petal_length, dtype: int64
For cross-referencing
<AxesSubplot:xlabel='petal_length', ylabel='Density'>
# Empirical PDF of setosa petal length from a 10-bin histogram.
# density=False makes np.histogram return the raw number of samples per bin.
counts, bin_edges = np.histogram(setosa['petal_length'], bins=10,
                                 density=False)
total = counts.sum()
print("histogram counts\n", counts)
# Dividing each bin count by the total gives the fraction of samples per bin.
pdf = counts / total
print("Sum of count is\n", total)
print("bin edges \n", bin_edges)
print("PDF below \n", pdf)

# Each value is drawn at the right edge of its bin (bin_edges[1:]).
plt.plot(bin_edges[1:], pdf)
# The legend is created after plotting so there is a line to label, and the
# label is passed as a one-element tuple: ('Pdf') is just the string 'Pdf',
# which would be consumed one character per line.
plt.gca().legend(('Pdf',))
plt.title('PDF For iris_setosa')
plt.xlabel("Petal length")
plt.ylabel("Percentage")
histogram counts
[ 1 1 2 7 12 14 7 4 0 2]
Sum of count is
50
bin edges
[1. 1.09 1.18 1.27 1.36 1.45 1.54 1.63 1.72 1.81 1.9 ]
PDF below
[0.02 0.02 0.04 0.14 0.24 0.28 0.14 0.08 0. 0.04]
[<matplotlib.lines.Line2D at 0x1a5ae102100>]
# Empirical CDF of setosa petal length from a 10-bin histogram.
# density=False makes np.histogram return the raw number of samples per bin.
counts, bin_edges = np.histogram(setosa['petal_length'], bins=10,
                                 density=False)
total = counts.sum()
print("histogram counts\n", counts)
# Dividing each bin count by the total gives the fraction of samples per bin.
pdf = counts / total
print("Sum of count is\n", total)
print("bin edges \n", bin_edges)
print("PDF is below \n", pdf)
# The running sum of the per-bin fractions is the CDF; its last value is 1.
cdf = np.cumsum(pdf)
print("CDF is below\n", cdf)

# Each value is drawn at the right edge of its bin (bin_edges[1:]).
plt.plot(bin_edges[1:], cdf)
# The legend is created after plotting so there is a line to label, and the
# label is passed as a one-element tuple: ('Cdf') is just the string 'Cdf',
# which would be consumed one character per line.
plt.gca().legend(('Cdf',))
plt.title('CDF For iris_setosa')
plt.xlabel("Petal length")
plt.ylabel("Percentage")
histogram counts
[ 1 1 2 7 12 14 7 4 0 2]
Sum of count is
50
bin edges
[1. 1.09 1.18 1.27 1.36 1.45 1.54 1.63 1.72 1.81 1.9 ]
PDF is below
[0.02 0.02 0.04 0.14 0.24 0.28 0.14 0.08 0. 0.04]
CDF is below
[0.02 0.04 0.08 0.22 0.46 0.74 0.88 0.96 0.96 1. ]
[<matplotlib.lines.Line2D at 0x1a5ae1721c0>]
# PDF and CDF of setosa petal length drawn on the same axes.
# With density=True np.histogram returns per-bin densities, not raw counts.
counts, bin_edges = np.histogram(
    setosa['petal_length'],
    bins=10,
    density=True,
)
print(counts)

# Rescale the densities so that the per-bin values add up to 1.
pdf = counts / sum(counts)
print(pdf)
print(bin_edges)

# The CDF is the running total of the PDF.
cdf = pdf.cumsum()

# Both curves are drawn at the right edge of each bin.
for curve in (pdf, cdf):
    plt.plot(bin_edges[1:], curve)

plt.gca().legend(('Pdf', 'Cdf'))
plt.title('PDF and CDF For iris_setosa')
plt.xlabel("Petal length")
plt.ylabel("Percentage")
plt.show()
[0.22222222 0.22222222 0.44444444 1.55555556 2.66666667 3.11111111
1.55555556 0.88888889 0. 0.44444444]
[0.02 0.02 0.04 0.14 0.24 0.28 0.14 0.08 0. 0.04]
[1. 1.09 1.18 1.27 1.36 1.45 1.54 1.63 1.72 1.81 1.9 ]
# Compare the 10-bin PDF/CDF of setosa petal length with a 20-bin PDF.
counts, bin_edges = np.histogram(setosa['petal_length'], bins=10,
                                 density=True)
# Rescale the densities so that the per-bin values add up to 1.
pdf = counts / sum(counts)
print(pdf)
print(bin_edges)
cdf = np.cumsum(pdf)
plt.plot(bin_edges[1:], pdf)
plt.plot(bin_edges[1:], cdf)

# Same data re-binned into 20 bins; only the PDF is drawn for this binning.
counts, bin_edges = np.histogram(setosa['petal_length'], bins=20,
                                 density=True)
pdf = counts / sum(counts)
plt.plot(bin_edges[1:], pdf)

# Labels are matched to the lines in plotting order. The third line is the
# 20-bin PDF, so it is labelled as such instead of 'bin edges'.
plt.gca().legend(('Pdf (10 bins)', 'Cdf (10 bins)', 'Pdf (20 bins)'))
plt.title('PDF and CDF For iris_setosa')
plt.xlabel("Petal length")
plt.ylabel("Percentage")
plt.show()
[0.02 0.02 0.04 0.14 0.24 0.28 0.14 0.08 0. 0.04]
[1. 1.09 1.18 1.27 1.36 1.45 1.54 1.63 1.72 1.81 1.9 ]
# PDF and CDF of petal length for all three species on one set of axes.
# The species are processed in the order setosa, virginica, versicolor, so
# the printed PDFs and bin edges appear in that order.
for species_name, species in (('setosa', setosa),
                              ('virginica', virginica),
                              ('versicolor', versicolor)):
    counts, bin_edges = np.histogram(species['petal_length'], bins=10,
                                     density=True)
    # Rescale the densities so that the per-bin values add up to 1.
    pdf = counts / sum(counts)
    print(pdf)
    print(bin_edges)
    cdf = np.cumsum(pdf)
    # Label every curve so the six lines can be told apart in the legend.
    plt.plot(bin_edges[1:], pdf, label=species_name + ' Pdf')
    plt.plot(bin_edges[1:], cdf, label=species_name + ' Cdf')

plt.legend()
# The figure shows all three species, not only versicolor.
plt.title('PDF and CDF For iris_setosa, iris_virginica and iris_versicolor')
plt.xlabel("Petal length")
plt.ylabel("Percentage")
plt.show()
[0.02 0.02 0.04 0.14 0.24 0.28 0.14 0.08 0. 0.04]
[1. 1.09 1.18 1.27 1.36 1.45 1.54 1.63 1.72 1.81 1.9 ]
[0.02 0.1 0.24 0.08 0.18 0.16 0.1 0.04 0.02 0.06]
[4.5 4.74 4.98 5.22 5.46 5.7 5.94 6.18 6.42 6.66 6.9 ]
[0.02 0.04 0.06 0.04 0.16 0.14 0.12 0.2 0.14 0.08]
[3. 3.21 3.42 3.63 3.84 4.05 4.26 4.47 4.68 4.89 5.1 ]
# Mean and standard deviation of petal length for each species.
print("Means:")
print(np.mean(setosa["petal_length"]))
# Setosa mean again with the single value 50 appended as an outlier.
print(np.mean(np.append(setosa["petal_length"], 50)))
for species in (virginica, versicolor):
    print(np.mean(species["petal_length"]))

print("\nStd-dev:")
for species in (setosa, virginica, versicolor):
    print(np.std(species["petal_length"]))
Means:
1.464
2.4156862745098038
5.5520000000000005
4.26
Std-dev:
0.17176728442867112
0.546347874526844
0.4651881339845203
# Median, percentile cut points and median absolute deviation of petal
# length for each species.
print("\nMedians:")
print(np.median(setosa["petal_length"]))
# Setosa median again with the single value 50 appended as an outlier.
print(np.median(np.append(setosa["petal_length"], 50)))
for species in (virginica, versicolor):
    print(np.median(species["petal_length"]))

print("\nQuantiles:")
# np.arange(0, 100, 25) requests the 0th, 25th, 50th and 75th percentiles.
for species in (setosa, virginica, versicolor):
    print(np.percentile(species["petal_length"], np.arange(0, 100, 25)))

print("\n90th Percentiles:")
for species in (setosa, virginica, versicolor):
    print(np.percentile(species["petal_length"], 90))

from statsmodels import robust

print("\nMedian Absolute Deviation")
for species in (setosa, virginica, versicolor):
    print(robust.mad(species["petal_length"]))
Medians:
1.5
1.5
5.55
4.35
Quantiles:
[1. 1.4 1.5 1.575]
[4.5 5.1 5.55 5.875]
[3. 4. 4.35 4.6 ]
90th Percentiles:
1.7
6.31
4.8
Median Absolute Deviation
0.14826022185056031
0.6671709983275211
0.5189107764769602